# Show dataset files: walk the data/ directory and print every file path found.
import os

for root, _, files in os.walk('data'):
    for name in files:
        print(os.path.join(root, name))
data/CA_youtube_trending_data.csv data/GB_youtube_trending_data.csv data/GB_category_id.json data/US_youtube_trending_data.csv data/CA_category_id.json data/US_category_id.json
import plotly.express as px  # Plotly
import plotly.graph_objects as go
import datetime
import json
import numpy as np
import pandas as pd
# NOTE: `from pandas.io.json import json_normalize` was deprecated in pandas 1.0
# and removed in 2.0; the flattening below uses pd.json_normalize instead.
from sklearn.preprocessing import StandardScaler
# Load Data
# Canadian trending-videos dataset (one CSV row per trending snapshot)
df = pd.read_csv('data/CA_youtube_trending_data.csv')
# Number of rows and columns
# (rows, columns) tuple of the loaded frame
df.shape
(94544, 16)
# Data types of each column and non-missing rows
# Per the output below, only 'description' has missing values
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 94544 entries, 0 to 94543 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 video_id 94544 non-null object 1 title 94544 non-null object 2 publishedAt 94544 non-null object 3 channelId 94544 non-null object 4 channelTitle 94544 non-null object 5 categoryId 94544 non-null int64 6 trending_date 94544 non-null object 7 tags 94544 non-null object 8 view_count 94544 non-null int64 9 likes 94544 non-null int64 10 dislikes 94544 non-null int64 11 comment_count 94544 non-null int64 12 thumbnail_link 94544 non-null object 13 comments_disabled 94544 non-null bool 14 ratings_disabled 94544 non-null bool 15 description 92891 non-null object dtypes: bool(2), int64(5), object(9) memory usage: 10.3+ MB
# Parse trending_date to datetime
# ISO strings ending in 'Z' parse to tz-aware (UTC) timestamps
df['dt_trending'] = pd.to_datetime(df['trending_date'])
df['dt_trending']
0 2020-08-12 00:00:00+00:00
1 2020-08-12 00:00:00+00:00
2 2020-08-12 00:00:00+00:00
3 2020-08-12 00:00:00+00:00
4 2020-08-12 00:00:00+00:00
...
94539 2021-11-21 00:00:00+00:00
94540 2021-11-21 00:00:00+00:00
94541 2021-11-21 00:00:00+00:00
94542 2021-11-21 00:00:00+00:00
94543 2021-11-21 00:00:00+00:00
Name: dt_trending, Length: 94544, dtype: datetime64[ns, UTC]
# Parse publish_time to datetime
# Same UTC-aware parsing as trending_date above
df['dt_publish'] = pd.to_datetime(df['publishedAt'])
df['dt_publish']
0 2020-08-11 07:30:02+00:00
1 2020-08-11 16:34:06+00:00
2 2020-08-11 17:00:10+00:00
3 2020-08-11 19:20:14+00:00
4 2020-08-11 15:10:05+00:00
...
94539 2021-11-12 06:24:47+00:00
94540 2021-11-14 13:05:23+00:00
94541 2021-11-18 03:26:19+00:00
94542 2021-11-11 18:53:01+00:00
94543 2021-11-16 01:16:19+00:00
Name: dt_publish, Length: 94544, dtype: datetime64[ns, UTC]
# Add category titles from Json to DF
# NOTE(review): the mapping is taken from the US category file even though the
# frame holds Canadian data; data/CA_category_id.json also exists — confirm the
# US ids cover every categoryId present in the CA data.
cats = pd.read_json('data/US_category_id.json')
cat_map = pd.json_normalize(cats['items'])[['id', 'snippet.title']]
# Build {category id (str) -> title} in one pass, instead of a side-effecting
# row-wise apply() that mutates a global dict.
py_map = dict(zip(cat_map['id'], cat_map['snippet.title']))
# Category Mapping
print(py_map)
# Vectorized lookup: categoryId is int64 but the JSON ids are strings, so cast
# before mapping. Unknown ids become NaN rather than raising KeyError.
df['cat_titles'] = df['categoryId'].astype(str).map(py_map)
df['cat_titles']
{'1': 'Film & Animation', '2': 'Autos & Vehicles', '10': 'Music', '15': 'Pets & Animals', '17': 'Sports', '18': 'Short Movies', '19': 'Travel & Events', '20': 'Gaming', '21': 'Videoblogging', '22': 'People & Blogs', '23': 'Comedy', '24': 'Entertainment', '25': 'News & Politics', '26': 'Howto & Style', '27': 'Education', '28': 'Science & Technology', '29': 'Nonprofits & Activism', '30': 'Movies', '31': 'Anime/Animation', '32': 'Action/Adventure', '33': 'Classics', '34': 'Comedy', '35': 'Documentary', '36': 'Drama', '37': 'Family', '38': 'Foreign', '39': 'Horror', '40': 'Sci-Fi/Fantasy', '41': 'Thriller', '42': 'Shorts', '43': 'Shows', '44': 'Trailers'}
0 Music
1 Entertainment
2 Gaming
3 People & Blogs
4 Howto & Style
...
94539 Comedy
94540 Entertainment
94541 News & Politics
94542 Comedy
94543 Gaming
Name: cat_titles, Length: 94544, dtype: object
# Export DF for Tableau Visualizations
# df.to_csv('out/canada_trending_yt.csv')
# Browse through data
# First five rows, including the derived dt_trending / dt_publish / cat_titles columns
df.head()
| video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | dislikes | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | dt_trending | dt_publish | cat_titles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | KX06ksuS6Xo | Diljit Dosanjh: CLASH (Official) Music Video |... | 2020-08-11T07:30:02Z | UCZRdNleCgW-BGUJf-bbjzQg | Diljit Dosanjh | 10 | 2020-08-12T00:00:00Z | clash diljit dosanjh|diljit dosanjh|diljit dos... | 9140911 | 296541 | 6180 | 30059 | https://i.ytimg.com/vi/KX06ksuS6Xo/default.jpg | False | False | CLASH official music video performed by DILJIT... | 2020-08-12 00:00:00+00:00 | 2020-08-11 07:30:02+00:00 | Music |
| 1 | J78aPJ3VyNs | I left youtube for a month and THIS is what ha... | 2020-08-11T16:34:06Z | UCYzPXprvl5Y-Sf0g4vX-m6g | jacksepticeye | 24 | 2020-08-12T00:00:00Z | jacksepticeye|funny|funny meme|memes|jacksepti... | 2038853 | 353797 | 2628 | 40222 | https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg | False | False | I left youtube for a month and this is what ha... | 2020-08-12 00:00:00+00:00 | 2020-08-11 16:34:06+00:00 | Entertainment |
| 2 | M9Pmf9AB4Mo | Apex Legends | Stories from the Outlands – “Th... | 2020-08-11T17:00:10Z | UC0ZV6M2THA81QT9hrVWJG3A | Apex Legends | 20 | 2020-08-12T00:00:00Z | Apex Legends|Apex Legends characters|new Apex ... | 2381688 | 146740 | 2794 | 16549 | https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg | False | False | While running her own modding shop, Ramya Pare... | 2020-08-12 00:00:00+00:00 | 2020-08-11 17:00:10+00:00 | Gaming |
| 3 | 3C66w5Z0ixs | I ASKED HER TO BE MY GIRLFRIEND... | 2020-08-11T19:20:14Z | UCvtRTOMP2TqYqu51xNrqAzg | Brawadis | 22 | 2020-08-12T00:00:00Z | brawadis|prank|basketball|skits|ghost|funny vi... | 1514614 | 156914 | 5857 | 35331 | https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg | False | False | SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib... | 2020-08-12 00:00:00+00:00 | 2020-08-11 19:20:14+00:00 | People & Blogs |
| 4 | VIUo6yapDbc | Ultimate DIY Home Movie Theater for The LaBran... | 2020-08-11T15:10:05Z | UCDVPcEbVLQgLZX0Rt6jo34A | Mr. Kate | 26 | 2020-08-12T00:00:00Z | The LaBrant Family|DIY|Interior Design|Makeove... | 1123889 | 45803 | 964 | 2198 | https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg | False | False | Transforming The LaBrant Family's empty white ... | 2020-08-12 00:00:00+00:00 | 2020-08-11 15:10:05+00:00 | Howto & Style |
# Weekdays
# new_df is an alias of df (not a copy), so the derived columns land on df too —
# later cells (e.g. top5_df) show df carrying 'weekday' and 'hour'.
new_df = df
new_df['weekday'] = df['dt_publish'].dt.weekday  # Monday=0 ... Sunday=6
new_df['hour'] = df['dt_publish'].dt.hour        # publish hour (UTC)
# .copy() so the later time_of_day assignment doesn't trigger
# SettingWithCopyWarning on a slice of new_df.
heatmap_df = new_df[['weekday', 'hour']].copy()
heatmap_df
| weekday | hour | |
|---|---|---|
| 0 | 1 | 7 |
| 1 | 1 | 16 |
| 2 | 1 | 17 |
| 3 | 1 | 19 |
| 4 | 1 | 15 |
| ... | ... | ... |
| 94539 | 4 | 6 |
| 94540 | 6 | 13 |
| 94541 | 3 | 3 |
| 94542 | 3 | 18 |
| 94543 | 1 | 1 |
94544 rows × 2 columns
# Determine Time of Day
def categ_hours(row):
    """Bucket a row's publish hour into a time-of-day label.

    Morning = [4, 12), Evening = [12, 20], Night = everything else
    (i.e. 21-23 and 0-3).
    """
    hour = row['hour']
    if 4 <= hour < 12:
        return 'Morning'
    if 12 <= hour <= 20:
        return 'Evening'
    return 'Night'
# Pass the function directly — the lambda wrapper around categ_hours was redundant.
heatmap_df['time_of_day'] = heatmap_df.apply(categ_hours, axis=1)
heatmap_df['time_of_day']
/var/folders/92/n74y69rx44qg1gpcmd38xs4h0000gn/T/ipykernel_43996/4067577027.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy heatmap_df['time_of_day'] = heatmap_df.apply(lambda x: categ_hours(x), axis=1)
0 Morning
1 Evening
2 Evening
3 Evening
4 Evening
...
94539 Morning
94540 Evening
94541 Night
94542 Evening
94543 Night
Name: time_of_day, Length: 94544, dtype: object
# Transform heatmap_df to numpy data: rows = time of day, cols = weekday (Mon=0)
series = heatmap_df.groupby(['weekday', 'time_of_day']).size()
row_order = ['Morning', 'Evening', 'Night']
data = np.zeros((3, 7))
for r, tod in enumerate(row_order):
    for day in range(7):
        # .get with a default avoids a KeyError when a (weekday, time_of_day)
        # combination has no videos at all
        data[r][day] = series.get((day, tod), 0)
data
array([[2091., 1804., 1618., 1343., 3603., 1385., 2775.],
[8752., 9205., 7829., 7309., 7670., 7448., 9226.],
[4016., 3440., 3354., 2845., 2690., 2781., 3360.]])
The heatmap shows that most videos were published in the Evening timeframe (Noon - 8pm) with most being published on a Tuesday or Sunday evening. Within the morning hours, most videos were published on Friday and the least on Thursday or Saturday morning. For the 'graveyard' hours, most trending videos were published Monday night. One could infer that publishing a video Tuesday evening and Sunday evening will give a creator the best chance to get a trending video.
# Plot the heatmap
fig = px.imshow(
    data,
    # Colorbar label reflects what the cells actually hold — counts of published
    # videos (the original "Productivity" label came from the plotly docs example).
    labels=dict(x="Day of Week", y="Time of Day", color="Videos Published"),
    x=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    y=['Morning', 'Evening', 'Night'],
    title="Publish Date of Trending Videos in Canada",
)
fig.update_xaxes(side="top")
fig.show()
The box plot shows that trending videos normally have between 500k and 2.3M views. Videos with over 5M views tend to be an exception to the rule. We can posit that creators need their video to gain at least 500k views before it appears on the Trending Videos page.
# Distribution of View Count
# Log-scaled x-axis: view counts span several orders of magnitude
fig = px.box(df, x="view_count", title="View Counts of Trending Videos in Canada", log_x=True)
fig.show()
The numerical distributions show that trending videos gain more likes than comments or dislikes. The scale of each engagement method is very different. The median amount of likes is 52K while the median dislikes is 0.8K. Comments sit slightly above dislikes at 3.3K. These medians show that creators need high like and comment counts with low dislikes to become trending.
# Like, Dislike, Comment Box Plots
# One box trace per engagement metric, on a shared log y-axis.
fig = go.Figure()
for metric in ("likes", "dislikes", "comment_count"):
    fig.add_trace(go.Box(y=df[metric].values, name=metric))
fig.update_yaxes(type="log")
fig.update_layout(
    title="Comparing User Engagement Across Trending Videos",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)
fig.show()
The data generally shows a positive correlation. Some stronger correlations seem to exist between comment_count and likes. The view_count vs likes graph in the top row seems to suggest a lower bound on the number of people viewing the video as the number of likes increases. Also in the top row, the view_count vs comment_count graph shows two characteristics. There seems to be one set of videos that gain high views with little to no comments, as can be seen in the cluster of points along the y-axis. The second characteristic is the positive correlation of comments as the number of views increases.
# Correlation Matrix of Numerical Columns
# Pairwise scatter plots of the four engagement metrics
fig = px.scatter_matrix(df[["view_count", "likes", "dislikes", "comment_count"]])
fig.show()
# Load US + Great Britain (English speaking countries)
us_df = pd.read_csv('data/US_youtube_trending_data.csv')
gb_df = pd.read_csv('data/GB_youtube_trending_data.csv')

# Convert trending dates to datetime
us_df['dt_trending'] = pd.to_datetime(us_df['trending_date'])
gb_df['dt_trending'] = pd.to_datetime(gb_df['trending_date'])

def _daily_max_views(frame):
    """Collapse duplicate (day, video) rows, keeping each video's max view count."""
    grouped = frame.groupby([frame['dt_trending'].dt.date, 'video_id'])
    return grouped.agg({'view_count': 'max'}).reset_index()

# Remove duplicate videos that appear twice in one day
df_days1 = _daily_max_views(df)
df_days2 = _daily_max_views(us_df)
df_days3 = _daily_max_views(gb_df)
df_days1
| dt_trending | video_id | view_count | |
|---|---|---|---|
| 0 | 2020-08-12 | -n8IrFPSFCU | 1085365 |
| 1 | 2020-08-12 | 0a5YhsnITFE | 155846 |
| 2 | 2020-08-12 | 0l3-iufiywU | 4807152 |
| 3 | 2020-08-12 | 1hx4vCChBlI | 232400 |
| 4 | 2020-08-12 | 1yxdYv_4h-E | 502455 |
| ... | ... | ... | ... |
| 90701 | 2021-11-21 | zQIl3K_u1wU | 1210802 |
| 90702 | 2021-11-21 | z_yxknfa8A8 | 506867 |
| 90703 | 2021-11-21 | zhRX2hsuXjM | 401584 |
| 90704 | 2021-11-21 | zpSL0CnptXk | 3411503 |
| 90705 | 2021-11-21 | zqmmeVsNIgs | 1521002 |
90706 rows × 3 columns
# View count line graph per week
# Total daily views per country, stacked into one long frame keyed by country.
daily_totals = [
    frame.groupby('dt_trending').agg({'view_count': 'sum'})
    for frame in (df_days1, df_days2, df_days3)
]
comb_df = pd.concat(daily_totals, keys=["Canada", "United States", "Great Britain"]).reset_index()
comb_df = comb_df.rename(columns={'level_0': 'country'})
comb_df
| country | dt_trending | view_count | |
|---|---|---|---|
| 0 | Canada | 2020-08-12 | 421071680 |
| 1 | Canada | 2020-08-13 | 444448321 |
| 2 | Canada | 2020-08-14 | 476616821 |
| 3 | Canada | 2020-08-15 | 514613720 |
| 4 | Canada | 2020-08-16 | 521236310 |
| ... | ... | ... | ... |
| 1354 | Great Britain | 2021-11-17 | 416991569 |
| 1355 | Great Britain | 2021-11-18 | 384896101 |
| 1356 | Great Britain | 2021-11-19 | 391549974 |
| 1357 | Great Britain | 2021-11-20 | 448986296 |
| 1358 | Great Britain | 2021-11-21 | 475305409 |
1359 rows × 3 columns
The US seems to dominate views on trending videos until sometime in March where Canada starts to become the dominant source. Great Britain, for the most part, sees less views than either country. Also, Great Britain seems more closely synced with Canadian viewers than US viewers, as can be seen in the Sep. 16th spike and July 14th / July 15th spikes. US viewers seem to have different tastes than Canadian + Great Britain viewers. Canadians also have the largest viewership spike on any given day at 1.19B views on July 4th.
# Daily total trending-video views, one line per country
fig = px.line(comb_df, x="dt_trending", y="view_count", color='country', title="Trending Videos View Count by Country")
fig.show()
# Top 5 Videos July 4th in Canada
# Filter to the spike day, rank by views, and keep each video once.
spike_day = datetime.date(2021, 7, 4)
top5_df = (
    df[df['dt_trending'].dt.date == spike_day]
    .sort_values('view_count', ascending=False)
    .drop_duplicates(subset='video_id')
    .head(5)
)
top5_df
| video_id | title | publishedAt | channelId | channelTitle | categoryId | trending_date | tags | view_count | likes | ... | comment_count | thumbnail_link | comments_disabled | ratings_disabled | description | dt_trending | dt_publish | cat_titles | weekday | hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 66484 | uTQnMukAhsg | When the cheating went wrong #Shorts | 2021-06-25T00:56:11Z | UCt8z2S30Wl-GQEluFVM8NUw | Fortnite Fun TV | 24 | 2021-07-04T00:00:00Z | [None] | 137192365 | 3945725 | ... | 3609 | https://i.ytimg.com/vi/uTQnMukAhsg/default.jpg | False | False | When the cheating went wrong #ShortsHey guys! ... | 2021-07-04 00:00:00+00:00 | 2021-06-25 00:56:11+00:00 | Entertainment | 4 | 0 |
| 66420 | 5MkDvM0zvWs | The Invisible Glass Cube #shorts Experiment by... | 2021-06-20T13:00:03Z | UCmDdTOJBGCeF-iE0ADB_6Wg | FAMILY BOOMS | 24 | 2021-07-04T00:00:00Z | shorts|youtube shorts|tiktok videos|family boo... | 120289472 | 4160450 | ... | 33886 | https://i.ytimg.com/vi/5MkDvM0zvWs/default.jpg | False | False | Subscribe ❤ click ▶ https://urlgeni.us/youtube... | 2021-07-04 00:00:00+00:00 | 2021-06-20 13:00:03+00:00 | Entertainment | 6 | 13 |
| 66479 | eHmLgbTHEoY | Guess the sound challenge🎵 #shorts by Tsuriki ... | 2021-06-22T04:00:08Z | UCymK_3BWUcoYVVf5D_GmACQ | Tsuriki Show | 23 | 2021-07-04T00:00:00Z | tsuriki show|vova|anya|vovanya|tiktok compilat... | 110757167 | 3180302 | ... | 4167 | https://i.ytimg.com/vi/eHmLgbTHEoY/default.jpg | False | False | Thank you for watching.Subscribe to Tsuriki Sh... | 2021-07-04 00:00:00+00:00 | 2021-06-22 04:00:08+00:00 | Comedy | 1 | 4 |
| 66418 | PRz64kSEJqs | She is foxy but not enough #Shorts | 2021-06-29T00:47:43Z | UCt8z2S30Wl-GQEluFVM8NUw | Fortnite Fun TV | 24 | 2021-07-04T00:00:00Z | [None] | 108237341 | 3775246 | ... | 2238 | https://i.ytimg.com/vi/PRz64kSEJqs/default.jpg | False | False | She is foxy but not enough #ShortsHey guys! Wa... | 2021-07-04 00:00:00+00:00 | 2021-06-29 00:47:43+00:00 | Entertainment | 1 | 0 |
| 66506 | Fw7fbKoK3e8 | MvRyhan Funny videos #tiktok #Shorts | 2021-06-25T07:37:36Z | UCcFQLco2CA2uq9J2Uwcoi6Q | Mv Ryhan | 24 | 2021-07-04T00:00:00Z | [None] | 103975007 | 1841163 | ... | 7027 | https://i.ytimg.com/vi/Fw7fbKoK3e8/default.jpg | False | False | #shorts | 2021-07-04 00:00:00+00:00 | 2021-06-25 07:37:36+00:00 | Entertainment | 4 | 7 |
5 rows × 21 columns
All I can say is that this video somehow broke the Canadian YT viewership. It's kinda funny but mostly dumb. Humor cannot be explained :(
# The top trending video on July 4th in Canada, the day with the most views out of the three countries
from IPython.display import YouTubeVideo
# Embed the player for the highest-viewed video found above
YouTubeVideo(top5_df['video_id'].iloc[0]) # so dumb :(
# Scatterplot PCA'd of view count, likes, dislikes, comment_count; cat_titles as color
from sklearn.preprocessing import StandardScaler  # re-imported so this cell runs on its own
features = ['view_count', 'likes', 'dislikes', 'comment_count']
# Separating out the features
x = df.loc[:, features].values
# (The original also extracted a `y` target array here, but it was never used —
# finalDf below reads df[['cat_titles']] directly — so it has been dropped.)
# Standardizing the features to zero mean / unit variance so PCA isn't
# dominated by view_count's much larger scale
x = StandardScaler().fit_transform(x)
x
array([[ 0.90392498, 0.34987337, 0.22071591, 0.17296373],
[-0.09773337, 0.48761328, -0.04302468, 0.27569283],
[-0.04938069, -0.01050068, -0.03069897, 0.03640265],
...,
[-0.29722465, -0.34705364, -0.17838479, -0.13087709],
[-0.05074735, 0.2942852 , -0.00174096, -0.12090035],
[-0.35301595, -0.35184336, -0.21231763, -0.12007148]])
from sklearn.decomposition import PCA

# Project the four standardized engagement features onto two principal components.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
principalDf
| principal component 1 | principal component 2 | |
|---|---|---|
| 0 | 0.843546 | -0.247972 |
| 1 | 0.316392 | 0.334232 |
| 2 | -0.029612 | 0.059778 |
| 3 | 0.116050 | 0.140335 |
| 4 | -0.384164 | 0.045472 |
| ... | ... | ... |
| 94539 | 9.220411 | -3.837237 |
| 94540 | 1.881429 | -1.319558 |
| 94541 | -0.487081 | 0.046670 |
| 94542 | 0.076418 | -0.037536 |
| 94543 | -0.530138 | 0.091080 |
94544 rows × 2 columns
# Attach category titles to the PCA coordinates; the axis=1 concat aligns on the
# shared default RangeIndex of both frames
finalDf = pd.concat([principalDf, df[['cat_titles']]], axis = 1)
finalDf
| principal component 1 | principal component 2 | cat_titles | |
|---|---|---|---|
| 0 | 0.843546 | -0.247972 | Music |
| 1 | 0.316392 | 0.334232 | Entertainment |
| 2 | -0.029612 | 0.059778 | Gaming |
| 3 | 0.116050 | 0.140335 | People & Blogs |
| 4 | -0.384164 | 0.045472 | Howto & Style |
| ... | ... | ... | ... |
| 94539 | 9.220411 | -3.837237 | Comedy |
| 94540 | 1.881429 | -1.319558 | Entertainment |
| 94541 | -0.487081 | 0.046670 | News & Politics |
| 94542 | 0.076418 | -0.037536 | Comedy |
| 94543 | -0.530138 | 0.091080 | Gaming |
94544 rows × 3 columns
The PCA attempts to show what a 4D graph of user engagement points would look like in 2D. If we were to try and predict a given trending video to a category, we could probably do it for some of the Music category, but most of the other categories are overlapping. This shows that there isn't enough information to "accurately" categorize a given trending video based on user engagement alone.
# 2D PCA scatter, colored by video category
fig = px.scatter(finalDf, x="principal component 1", y="principal component 2", color="cat_titles", title="Video Categories by View Count, Likes, Dislikes, and Comment Count")
fig.show()
# Choose a subset of categories to remove
cat_to_remove = ["Music"]
# Keep every row whose category is NOT in the removal list
keep_mask = ~finalDf['cat_titles'].isin(cat_to_remove)
selectedDf = finalDf[keep_mask]
selectedDf
| principal component 1 | principal component 2 | cat_titles | |
|---|---|---|---|
| 1 | 0.316392 | 0.334232 | Entertainment |
| 2 | -0.029612 | 0.059778 | Gaming |
| 3 | 0.116050 | 0.140335 | People & Blogs |
| 4 | -0.384164 | 0.045472 | Howto & Style |
| 5 | -0.317690 | 0.100284 | Education |
| ... | ... | ... | ... |
| 94539 | 9.220411 | -3.837237 | Comedy |
| 94540 | 1.881429 | -1.319558 | Entertainment |
| 94541 | -0.487081 | 0.046670 | News & Politics |
| 94542 | 0.076418 | -0.037536 | Comedy |
| 94543 | -0.530138 | 0.091080 | Gaming |
81349 rows × 3 columns
By removing the Music category, we get a closer look at the concentration of the other categories. As we posited above, there doesn't seem to be a clear enough separation of categories from user engagement alone to attempt building a classifier that automatically categorizes trending videos by category.
# Same PCA scatter with the dominant Music category removed for a closer look
fig = px.scatter(selectedDf, x="principal component 1", y="principal component 2", color="cat_titles", title='Video Categories (excluding Music) by View Count, Likes, Dislikes, and Comment Count')
fig.show()
# Total variance retained by the two components; f-string replaces the dated %-formatting
f'Amount of information retained after PCA: {pca.explained_variance_ratio_.sum() * 100:.2f}%'
'Amount of information retained after PCA: 88.95%'
An 89% retention of information shows that the PCA was able to keep 89% of the original variance present in the original 4D graph. With the amount of overlap, it seems doubtful that the 11% lost could help in separating the categories.